Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)

# Print dataset dimensions and the ASD/control split of the samples
glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 30112
## Number of samples: 80 (45 ASD, 35 controls)

Dimensionality reduction using PCA

Since there are more genes than samples, we can perform PCA and reduce the dimension from 30K to 80 without losing any information

# PCA over samples: transpose so samples are rows, then keep the rotated
# coordinates (scores) as a data frame for the clustering methods below
pca = prcomp(t(datExpr))
datExpr_redDim = data.frame(pca$x)

Clustering Methods

clusterings = list()



K-means clustering

No recognisable best k, so chose k=4

# Elbow method: total within-cluster sum of squares for k = 1..15.
# vapply (not sapply) guarantees a plain numeric vector is returned.
set.seed(123)
wss = vapply(seq_len(15),
             function(k) kmeans(datExpr_redDim, k, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 4   # no clear elbow in the curve; chosen by visual inspection
abline(v = best_k, col='blue')

# Final fit; nstart=25 random restarts to avoid poor local optima
datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster



Hierarchical Clustering

Chose k=9 as best number of clusters.

# Hierarchical clustering on euclidean distances between samples
# (hclust defaults: complete linkage), kept as a dendrogram for plotting
h_clusts = as.dendrogram(hclust(dist(datExpr_redDim)))
plot(h_clusts)
abline(h=126, col='blue')   # cut height corresponding to the chosen k

best_k = 9

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Younger ASD samples seem to be more similar to control samples than older ASD ones. Perhaps there’s a relation between age and severity of ASD: diagnoses are more inclusive nowadays, identifying mild autism cases that weren’t recognised before, and most of these new diagnoses are made in kids.

Colors:

  • Diagnosis: Blue=control, Green=ASD

  • Sex: Pink=Female, Blue=Male

  • Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital

  • Age: Purple=youngest, Yellow=oldest

clusterings[['hc']] = cutree(h_clusts, best_k)

# Map each age to a viridis colour (purple = youngest, yellow = oldest),
# returned as a named vector keyed by the age as a string.
# Parameterized over the ages vector; the default reproduces the original
# behaviour of reading datMeta$Age from the calling environment.
# NOTE(review): assumes Age is integer-valued -- fractional ages would make
# the colour vector and its names differ in length. Confirm upstream.
create_viridis_dict = function(ages = datMeta$Age){
  min_age = min(ages)
  max_age = max(ages)
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

# Reorder the metadata rows to match the dendrogram's leaf order, then map
# each annotation to a colour for the bars drawn under the dendrogram
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>% 
            mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'),                # Pink Female, Blue Male
                   'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 4 colours
                                        Brain_lobe=='Temporal'~'#7CAE00',
                                        Brain_lobe=='Parietal'~'#00BFC4',
                                        Brain_lobe=='Occipital'~'#C77CFF'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%            # Purple: young, Yellow: old
            dplyr::select(Age, Region, Sex, Diagnosis)
# Hide leaf labels, colour branches by the best_k-cluster cut, add colour bars
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)



Consensus Clustering

Chose the best clustering to be with k=6

*The rest of the output plots can be found in the Data/Gandal/consensusClusterng/samples folder



Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance (The 9 first components explain 60% of the variance, but decided to keep the first 26 to accumulate 80% of the variance)

  2. Run ICA with nbComp set to the same number of components kept from PCA, so its components can then be filtered

  3. Select components with kurtosis > 3

  4. Assign obs to genes with FDR<0.01 using the fdrtool package


Not a good method for clustering samples because:

ICA does not perform well with small samples (see Figure 4 of this paper)

Still, it leaves only 14 samples without a cluster

ICA_clusters %>% rowSums %>% table
## .
##  0  1  2  3 
## 14 45 13  8
# Tile plot of the binary sample-by-cluster membership matrix, plus an extra
# column (cl_sum) with each sample's total membership count
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()



WGCNA

This method doesn’t work:

Using the PCA-reduced expression dataset gives a very erratic \(R^2\). Choosing the best power (7) as well as the second best (5) ends up leaving all samples without a cluster

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = 1:20)
##    Power SFT.R.sq   slope truncated.R.sq  mean.k. median.k.  max.k.
## 1      1  0.00285  0.0763         0.3560 14.30000  1.36e+01 21.6000
## 2      2  0.32900 -0.5700         0.4640  3.94000  3.54e+00  8.0000
## 3      3  0.40400 -0.7830         0.3660  1.37000  1.18e+00  3.4900
## 4      4  0.78500 -0.7560         0.8640  0.55600  4.36e-01  1.6400
## 5      5  0.79500 -0.8980         0.7360  0.25800  1.71e-01  0.8760
## 6      6  0.20800 -2.8400         0.0684  0.13300  6.63e-02  0.5860
## 7      7  0.85500 -1.1400         0.8140  0.07490  2.85e-02  0.4150
## 8      8  0.15000 -2.1200        -0.0894  0.04560  1.33e-02  0.3050
## 9      9  0.06070 -1.6300        -0.0457  0.02940  6.28e-03  0.2310
## 10    10  0.08230 -1.3300        -0.0747  0.01990  2.87e-03  0.1780
## 11    11  0.02750 -0.9650         0.0595  0.01400  1.44e-03  0.1390
## 12    12  0.11900 -2.1600        -0.0125  0.01010  7.67e-04  0.1100
## 13    13  0.07210 -1.5500         0.0236  0.00751  3.98e-04  0.0875
## 14    14  0.05230 -1.4800         0.1530  0.00565  1.99e-04  0.0701
## 15    15  0.11700 -2.2900         0.0592  0.00431  9.48e-05  0.0564
## 16    16  0.12900 -2.3300         0.0131  0.00333  4.56e-05  0.0455
## 17    17  0.16500 -2.5000         0.0236  0.00259  2.28e-05  0.0369
## 18    18  0.08700 -1.8900         0.1140  0.00203  1.15e-05  0.0299
## 19    19  0.09750 -1.9300         0.1030  0.00160  5.83e-06  0.0243
## 20    20  0.15200 -2.3200         0.0343  0.00126  3.04e-06  0.0198

Running WGCNA with power=7

# Best power, as estimated by pickSoftThreshold above
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
## Warning in blockwiseModules(., power = best_power$powerEstimate, numericLabels = TRUE): blockwiseModules: mergeCloseModules failed with the following error message:
##      Error in mergeCloseModules(datExpr, colors[gsg$goodGenes], cutHeight = mergeCutHeight,  : 
##   Error in moduleEigengenes(expr = exprData[[set]]$data, colors = setColors,  : 
##   Color levels are empty. Possible reason: the only color is grey and grey module is excluded from the calculation.
## 
##  
## --> returning unmerged colors.

Cluster distribution

table(network$colors)
## 
##  0 
## 80

Running WGCNA with power=5

# 2nd best power (5), hard-coded from the pickSoftThreshold table above
network = datExpr_redDim %>% t %>% blockwiseModules(power=5, numericLabels=TRUE)
## Warning in blockwiseModules(., power = 5, numericLabels = TRUE): blockwiseModules: mergeCloseModules failed with the following error message:
##      Error in mergeCloseModules(datExpr, colors[gsg$goodGenes], cutHeight = mergeCutHeight,  : 
##   Error in moduleEigengenes(expr = exprData[[set]]$data, colors = setColors,  : 
##   Color levels are empty. Possible reason: the only color is grey and grey module is excluded from the calculation.
## 
##  
## --> returning unmerged colors.

Cluster distribution

table(network$colors)
## 
##  0 
## 80

Using the original expression dataset, the \(R^2\) threshold is never reached, although it gets closest at power 11, and the method still doesn’t manage to find any clusters within the data

best_power = datExpr %>% pickSoftThreshold(powerVector = 1:30)
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.289 247.00         0.1000    76.5      76.7   77.2
## 2      2    0.286 123.00         0.0953    74.1      74.5   75.5
## 3      3    0.282  81.00         0.0898    71.7      72.4   73.8
## 4      4    0.286  61.10         0.0901    69.5      70.3   72.2
## 5      5    0.283  48.60         0.0858    67.3      68.3   70.6
## 6      6    0.280  40.40         0.0812    65.2      66.4   69.0
## 7      7    0.279  34.30         0.0784    63.2      64.5   67.5
## 8      8    0.276  29.80         0.0746    61.3      62.8   66.0
## 9      9    0.273  26.40         0.0701    59.4      61.0   64.6
## 10    10    0.275  24.00         0.0714    57.6      59.4   63.1
## 11    11    0.786   6.89         0.8230    55.9      57.8   61.8
## 12    12    0.759   6.35         0.7990    54.2      56.2   60.4
## 13    13    0.786   5.84         0.8200    52.6      54.7   59.1
## 14    14    0.780   5.40         0.8070    51.1      53.2   57.8
## 15    15    0.729   4.88         0.7500    49.6      51.8   56.6
## 16    16    0.730   4.58         0.7510    48.1      50.5   55.3
## 17    17    0.694   4.35         0.7210    46.7      49.1   54.1
## 18    18    0.695   4.11         0.7210    45.4      47.8   53.0
## 19    19    0.697   3.87         0.7170    44.1      46.6   51.8
## 20    20    0.707   3.68         0.7250    42.8      45.3   50.7
## 21    21    0.729   3.31         0.7880    41.6      44.1   49.6
## 22    22    0.724   3.13         0.7770    40.5      43.0   48.6
## 23    23    0.612   2.94         0.6550    39.3      41.9   47.6
## 24    24    0.626   2.74         0.6780    38.2      40.8   46.5
## 25    25    0.610   2.59         0.6560    37.2      39.7   45.6
## 26    26    0.612   2.50         0.6570    36.1      38.7   44.6
## 27    27    0.614   2.41         0.6570    35.1      37.7   43.6
## 28    28    0.614   2.33         0.6570    34.2      36.7   42.7
## 29    29    0.708   2.15         0.7730    33.3      35.8   41.8
## 30    30    0.756   2.03         0.8180    32.4      34.8   40.9

Running WGCNA with power=11

# Best power, hard-coded to 11 from the table above
# NOTE(review): best_power$powerEstimate may be NA here since the R^2
# threshold was never reached -- hence the literal value. Confirm.
network = datExpr %>% blockwiseModules(power=11, numericLabels=TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 1
##       ..there is nothing to merge.

Cluster distribution

table(network$colors)
## 
##  1 
## 80



Gaussian Mixture Models with hard thresholding

Points don’t seem to follow a Gaussian distribution no matter the number of clusters; chose 4 groups, following the best k from K-means, because the two methods are similar

# BIC across 1..50 candidate cluster counts, for visual inspection only
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

# No clear BIC optimum, so reuse the k chosen for k-means
best_k = 4 # copying k-means best_k
gmm = GMM(datExpr_redDim, best_k)
# Hard assignment: each sample goes to the component with highest log-likelihood
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)

Plot of clusters with their centroids in gray

# Append the GMM centroids to the sample coordinates so both can be plotted
# together; centroids get NA labels (rendered in gray by ggplot)
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Clean up intermediates (cc_output, ICA_output, ICA_clusters_names and
# signals_w_kurtosis are created in earlier chunks not shown here).
# NOTE(review): 'c' is in the removal list -- presumably a leftover loop
# variable from a hidden chunk; if it doesn't exist rm() only warns. Confirm.
rm(wss, datExpr_k_means, h_clusts, cc_output, best_k, ICA_output, ICA_clusters_names, 
   signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta, 
   best_power, c, viridis_age_cols, create_viridis_dict)



Compare clusterings

Using Adjusted Rand Index:

# Append phenotype variables so they can be compared against the clusterings
clusters_plus_phenotype = clusterings
clusters_plus_phenotype[['Subject']] = datMeta$Subject_ID
clusters_plus_phenotype[['ASD']] = datMeta$Diagnosis_
clusters_plus_phenotype[['Region']] = datMeta$Brain_lobe
clusters_plus_phenotype[['Sex']] = datMeta$Sex
clusters_plus_phenotype[['Age']] = datMeta$Age

# Pairwise Adjusted Rand Index between every pair of partitions. Only the
# upper triangle (j >= i) is filled -- ARI is symmetric -- so the lower
# triangle stays NA and heatmap.2 leaves it blank.
n_partitions = length(clusters_plus_phenotype)
cluster_sim = data.frame(matrix(nrow = n_partitions, ncol = n_partitions))
for(i in seq_len(n_partitions)){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in i:n_partitions){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)

# Heatmap of the (rounded) similarity matrix with the values printed in cells
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, clusters_plus_phenotype, cluster_sim, n_partitions)



Scatter plots

# Interactive scatter plot of the first 3 PCs, annotated with every clustering
# and the phenotype variables. NOTE(review): 'cc' and 'ICA_min' clusterings
# are built in earlier chunks not visible here; selectable_scatter_plot is a
# project helper -- confirm both exist when this chunk runs.
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                            Subject_ID = datMeta$Subject_ID,
                     KMeans = as.factor(clusterings[['km']]),     Hierarchical = as.factor(clusterings[['hc']]),
                     Consensus = as.factor(clusterings[['cc']]),  ICA = as.factor(clusterings[['ICA_min']]),
                     GMM = as.factor(clusterings[['GMM']]),
                     Sex = as.factor(datMeta$Sex),                Region = as.factor(datMeta$Brain_lobe), 
                     Diagnosis = as.factor(datMeta$Diagnosis_),   Age = datMeta$Age)
selectable_scatter_plot(plot_points, plot_points)